if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
Bioconductor version 3.18 (BiocManager 1.30.22), R 4.3.2 (2023-10-31)
BiocManager::install(c("AnnotationHub", "ensembldb", "tidyverse", "readr")) 
'getOption("repos")' replaces Bioconductor standard repositories, see 'help("repositories",
package = "BiocManager")' for details.
Replacement repositories:
    CRAN: https://cran.rstudio.com/
Bioconductor version 3.18 (BiocManager 1.30.22), R 4.3.2 (2023-10-31)
Warning: package(s) not installed when version(s) same as or greater than current; use `force = TRUE` to
  re-install: 'AnnotationHub' 'ensembldb' 'tidyverse' 'readr'Old packages: 'ggtree'
Update all/some/none? [a/s/n]: 
a
Warning: downloaded length 0 != reported length 0Warning: cannot open URL 'https://bioconductor.org/packages/3.18/data/annotation/bin/macosx/big-sur-arm64/contrib/4.3/PACKAGES.rds': HTTP status was '404 Not Found'Warning: downloaded length 0 != reported length 0Warning: cannot open URL 'https://bioconductor.org/packages/3.18/data/annotation/bin/macosx/big-sur-arm64/contrib/4.3/PACKAGES.gz': HTTP status was '404 Not Found'Warning: downloaded length 0 != reported length 0Warning: cannot open URL 'https://bioconductor.org/packages/3.18/data/experiment/bin/macosx/big-sur-arm64/contrib/4.3/PACKAGES.rds': HTTP status was '404 Not Found'Warning: downloaded length 0 != reported length 0Warning: cannot open URL 'https://bioconductor.org/packages/3.18/data/experiment/bin/macosx/big-sur-arm64/contrib/4.3/PACKAGES.gz': HTTP status was '404 Not Found'Warning: downloaded length 0 != reported length 0Warning: cannot open URL 'https://bioconductor.org/packages/3.18/workflows/bin/macosx/big-sur-arm64/contrib/4.3/PACKAGES.rds': HTTP status was '404 Not Found'Warning: downloaded length 0 != reported length 0Warning: cannot open URL 'https://bioconductor.org/packages/3.18/workflows/bin/macosx/big-sur-arm64/contrib/4.3/PACKAGES.gz': HTTP status was '404 Not Found'Warning: downloaded length 0 != reported length 0Warning: cannot open URL 'https://bioconductor.org/packages/3.18/books/bin/macosx/big-sur-arm64/contrib/4.3/PACKAGES.rds': HTTP status was '404 Not Found'Warning: downloaded length 0 != reported length 0Warning: cannot open URL 'https://bioconductor.org/packages/3.18/books/bin/macosx/big-sur-arm64/contrib/4.3/PACKAGES.gz': HTTP status was '404 Not Found'trying URL 'https://bioconductor.org/packages/3.18/bioc/bin/macosx/big-sur-arm64/contrib/4.3/ggtree_3.10.0.tgz'
Content type 'application/x-gzip' length 920412 bytes (898 KB)
==================================================
downloaded 898 KB

The downloaded binary packages are in
    /var/folders/wq/m1c800752kjg8rn4n8q641tr0000gn/T//RtmpOw9eUP/downloaded_packages
setwd("~/Desktop/Lab/Working_Projects/trapseq_2023")
library(readr)
OG_DEGFILE<- read_delim(file="deg_analysis_DESeq_all.csv", delim = ",")
New names:Rows: 22053 Columns: 8── Column specification ─────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (1): row
dbl (7): ...1, baseMean, log2FoldChange, lfcSE, stat, pvalue, padj
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrow(OG_DEGFILE)
[1] 22053
deg_table <- data.frame(OG_DEGFILE, row.names = OG_DEGFILE$row )#gene ids stored as row names
geneID<- OG_DEGFILE$row
deg_table <- deg_table[ ,-c(1:2)]
library(biomartr) 
library(tidyverse)
── Attaching core tidyverse packages ───────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ purrr     1.0.2
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.0     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1── Conflicts ─────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
biomartr::organismBM(organism = "Arabidopsis thaliana")
Starting retrieval of all available BioMart datasets for Arabidopsis thaliana ...
Datasets for the following marts will be retrieved:
Processing mart ENSEMBL_MART_ENSEMBL ...
Starting retrieval of datasets information from mart ENSEMBL_MART_ENSEMBL ...
Processing mart ENSEMBL_MART_MOUSE ...
Starting retrieval of datasets information from mart ENSEMBL_MART_MOUSE ...
Processing mart ENSEMBL_MART_SEQUENCE ...
Starting retrieval of datasets information from mart ENSEMBL_MART_SEQUENCE ...
Processing mart ENSEMBL_MART_ONTOLOGY ...
Starting retrieval of datasets information from mart ENSEMBL_MART_ONTOLOGY ...
Processing mart ENSEMBL_MART_GENOMIC ...
Starting retrieval of datasets information from mart ENSEMBL_MART_GENOMIC ...
Processing mart ENSEMBL_MART_SNP ...
Starting retrieval of datasets information from mart ENSEMBL_MART_SNP ...
Processing mart ENSEMBL_MART_FUNCGEN ...
Starting retrieval of datasets information from mart ENSEMBL_MART_FUNCGEN ...
Processing mart plants_mart ...
Starting retrieval of datasets information from mart plants_mart ...
Processing mart plants_variations ...
Starting retrieval of datasets information from mart plants_variations ...
Processing mart fungi_mart ...
Starting retrieval of datasets information from mart fungi_mart ...
Processing mart fungi_variations ...
Starting retrieval of datasets information from mart fungi_variations ...
Processing mart metazoa_mart ...
Starting retrieval of datasets information from mart metazoa_mart ...
Processing mart protists_mart ...
Starting retrieval of datasets information from mart protists_mart ...
Processing mart protists_variations ...
Starting retrieval of datasets information from mart protists_variations ...
arabido_attributes = 
  biomartr::organismAttributes("Arabidopsis thaliana") %>% 
  filter(dataset == "athaliana_eg_gene")
Starting retrieval of all available BioMart datasets for Arabidopsis thaliana ...


Starting retrieval of all available BioMart attributes for Arabidopsis thaliana ...
Processing mart plants_mart and dataset athaliana_eg_gene ...
Starting retrieval of attributes information from mart plants_mart and dataset athaliana_eg_gene ...


Please cite: Drost HG, Paszkowski J. Biomartr: genomic data retrieval with R. Bioinformatics (2017) 33(8): 1216-1217. doi:10.1093/bioinformatics/btw821.
Processing mart plants_variations and dataset athaliana_eg_snp ...
Starting retrieval of attributes information from mart plants_variations and dataset athaliana_eg_snp ...


Please cite: Drost HG, Paszkowski J. Biomartr: genomic data retrieval with R. Bioinformatics (2017) 33(8): 1216-1217. doi:10.1093/bioinformatics/btw821.
Processing mart plants_mart and dataset athaliana_eg_gene ...
Starting retrieval of attributes information from mart plants_mart and dataset athaliana_eg_gene ...


Please cite: Drost HG, Paszkowski J. Biomartr: genomic data retrieval with R. Bioinformatics (2017) 33(8): 1216-1217. doi:10.1093/bioinformatics/btw821.
Processing mart plants_variations and dataset athaliana_eg_snp ...
Starting retrieval of attributes information from mart plants_variations and dataset athaliana_eg_snp ...


Please cite: Drost HG, Paszkowski J. Biomartr: genomic data retrieval with R. Bioinformatics (2017) 33(8): 1216-1217. doi:10.1093/bioinformatics/btw821.
arabido_attributes

attributes_to_retrieve = c("tair_symbol", "entrezgene_id")

result_BM <- biomartr::biomart( genes      = OG_DEGFILE$row,                  # genes were retrieved using biomartr::getGenome()
                                mart       = "plants_mart",                     # marts were selected with biomartr::getMarts()
                                dataset    = "athaliana_eg_gene",               # datasets were selected with biomartr::getDatasets()
                                attributes = attributes_to_retrieve,            # attributes were selected with biomartr::getAttributes()
                                filters =   "ensembl_gene_id" )# query key
Starting BioMart query ...


Please cite: Drost HG, Paszkowski J. Biomartr: genomic data retrieval with R. Bioinformatics (2017) 33(8): 1216-1217. doi:10.1093/bioinformatics/btw821.
head(result_BM)
# building the universe!
library(readr)
allgenes <- read_table("featcount.txt", 
                        col_names = TRUE, skip = 1)

── Column specification ──────────────────────────────────────────────────────────────────────────
cols(
  .default = col_double(),
  Geneid = col_character(),
  Chr = col_character(),
  Start = col_character(),
  End = col_character(),
  Strand = col_character()
)
ℹ Use `spec()` for the full column specifications.
read.count <- data.frame(allgenes, row.names = allgenes$Geneid ) #gene ids stored as row names
all_arabidopsis_genes <- (read.count$Geneid) # directly selects the gene column

# we want the correspondence of TAIR/Ensembl symbols with NCBI Entrez gene ids
attributes_to_retrieve = c("tair_symbol", "uniprotswissprot","entrezgene_id")

# Query the Ensembl API
all_arabidopsis_genes_annotated <- biomartr::biomart(genes = all_arabidopsis_genes,
                                                     mart       = "plants_mart",                 
                                                     dataset    = "athaliana_eg_gene",           
                                                     attributes = attributes_to_retrieve,        
                                                     filters =  "ensembl_gene_id" )  
Starting BioMart query ...


Please cite: Drost HG, Paszkowski J. Biomartr: genomic data retrieval with R. Bioinformatics (2017) 33(8): 1216-1217. doi:10.1093/bioinformatics/btw821.
# for compatibility with enrichGO universe
# genes in the universe need to be characters and not integers (Entrez gene id)
all_arabidopsis_genes_annotated$entrezgene_id = as.character(
  all_arabidopsis_genes_annotated$entrezgene_id) 
library(clusterProfiler)
clusterProfiler v4.10.0  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/

If you use clusterProfiler in published research, please cite:
T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141

Attaching package: ‘clusterProfiler’

The following object is masked from ‘package:purrr’:

    simplify

The following object is masked from ‘package:AnnotationDbi’:

    select

The following object is masked from ‘package:IRanges’:

    slice

The following object is masked from ‘package:S4Vectors’:

    rename

The following object is masked from ‘package:stats’:

    filter
# retrieving NCBI Entrez gene id for our genes called differential
diff_arabidopsis_genes_annotated <- biomartr::biomart(genes = geneID,
                                                     mart       = "plants_mart",                 
                                                     dataset    = "athaliana_eg_gene",           
                                                     attributes = attributes_to_retrieve,        
                                                     filters =  "ensembl_gene_id" )  
Starting BioMart query ...


Please cite: Drost HG, Paszkowski J. Biomartr: genomic data retrieval with R. Bioinformatics (2017) 33(8): 1216-1217. doi:10.1093/bioinformatics/btw821.
library("biomartr")
library("clusterProfiler")
library("tidyverse")
library("enrichplot")
suppressPackageStartupMessages(library("org.At.tair.db"))
library("biomaRt")  # only use to remove cache bug

# performing the ORA for Gene Ontology Biological Process class
ora_analysis_bp <- enrichGO(gene = diff_arabidopsis_genes_annotated$entrezgene_id, 
                            universe = all_arabidopsis_genes_annotated$entrezgene_id, 
                            OrgDb = org.At.tair.db,  # contains the TAIR/Ensembl id to GO correspondence for A. thaliana
                            keyType = "ENTREZID",
                            ont = "BP",              # either "BP", "CC" or "MF",
                            pAdjustMethod = "BH",
                            pvalueCutoff  = 0.05,
                            qvalueCutoff  = 0.01,
                            readable = TRUE, 
                            pool = FALSE)

# clusterProfiler::simplify to disambiguate which simplify() function you want to use
ora_analysis_bp_simplified <- clusterProfiler::simplify(ora_analysis_bp) 
write_delim(x = as.data.frame(ora_analysis_bp_simplified@result), 
            path = "go_results.tsv", 
            delim = "\t")
Warning: The `path` argument of `write_delim()` is deprecated as of readr 1.4.0.
Please use the `file` argument instead.
# have a look at a few columns and rows
ora_analysis_bp_simplified@result[1:5,1:8]
dotplot(ora_analysis_bp_simplified)

library("enrichplot")
ora_analysis_bp <- pairwise_termsim(ora_analysis_bp, method = "JC")
emapplot(ora_analysis_bp, color = "qvalue")

library("biomartr")
library("clusterProfiler")
library("tidyverse")
library("enrichplot")
suppressPackageStartupMessages(library("org.At.tair.db"))
library("biomaRt")  # only use to remove cache bug


search_kegg_organism('ath', by='kegg_code')
search_kegg_organism('Arabidopsis thaliana', by='scientific_name')


ora_analysis_kegg <- enrichKEGG(gene = diff_arabidopsis_genes_annotated$entrezgene_id,
                                universe = all_arabidopsis_genes_annotated$entrezgene_id,
                                organism = "ath",
                                keyType = "ncbi-geneid",
                                minGSSize = 10,
                                maxGSSize = 500,
                                pAdjustMethod = "BH",
                                qvalueCutoff = 0.01,
                                use_internal_data = FALSE) # force to query latest KEGG db
Reading KEGG annotation online: "https://rest.kegg.jp/link/ath/pathway"...
Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/ath"...
Reading KEGG annotation online: "https://rest.kegg.jp/conv/ncbi-geneid/ath"...
head(ora_analysis_kegg)
ora_analysis_kegg_modules <- enrichMKEGG(gene = diff_arabidopsis_genes_annotated$entrezgene_id,
                                         universe = all_arabidopsis_genes_annotated$entrezgene_id,
                                         organism = "ath",
                                         keyType = "ncbi-geneid",
                                         minGSSize = 10,           # minimal size of genes annotated by Ontology term for testing.
                                         maxGSSize = 500,          # maximal size of genes annotated for testing
                                         pAdjustMethod = "BH",
                                         qvalueCutoff = 0.01)
Reading KEGG annotation online: "https://rest.kegg.jp/link/ath/module"...
Reading KEGG annotation online: "https://rest.kegg.jp/list/module"...
Reading KEGG annotation online: "https://rest.kegg.jp/conv/ncbi-geneid/ath"...
browseKEGG(ora_analysis_kegg, 'ath04145')
library("pathview")

##############################################################################
Pathview is an open source software package distributed under GNU General
Public License version 3 (GPLv3). Details of GPLv3 is available at
http://www.gnu.org/licenses/gpl-3.0.html. Particullary, users are required to
formally cite the original Pathview paper (not just mention it) in publications
or products. For details, do citation("pathview") within R.

The pathview downloads and uses KEGG data. Non-academic uses may require a KEGG
license agreement (details at http://www.kegg.jp/kegg/legal.html).
##############################################################################
ath04145 <- pathview(gene.data  = all_arabidopsis_genes_annotated$entrezgene_id,
                     pathway.id = "ath04145",
                     species    = "ath")
Info: Getting gene ID data from KEGG...
Info: Done with data retrieval!
Info: Working in directory /Users/lilicastillo/Desktop/Lab/Working_Projects/trapseq_2023
Info: Writing image file ath04145.pathview.png
#ggplot(ora_analysis_kegg)
# create a simple dotplot graph
dotplot(ora_analysis_kegg, 
    color = "qvalue", 
    showCategory = 10, 
    size = "Count")

suppressPackageStartupMessages(library("org.At.tair.db"))
library(enrichplot)
library(DOSE)
library(pathview)
library(clusterProfiler)
library(AnnotationHub)
library(ensembldb)
library(tidyverse)


#convert gene symbers to ID

#if (!require("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")

#BiocManager::install(c("AnnotationDbi", "org.At.tair.db", "pathview", "gageData"))

#library(AnnotationDbi)
#library(org.At.tair.db)
#class(org.At.tair.db)
#columns(org.At.tair.db)

#print(res05)
library(readr)
#OG_DEGFILE<- read_delim(file="deg_analysis_DESeq_all.csv", delim = ",")
#nrow(OG_DEGFILE)


ALL_DEG<- read.csv(file = "deg_analysis_DESeq_all.csv")
ALL_DEG <- data.frame(ALL_DEG, row.names = ALL_DEG$row )#gene ids stored as row names
geneID<- ALL_DEG$row
#deg_table <- deg_table[ ,-c(1:2)]

ALL_DEG$symbol = mapIds(org.At.tair.db, 
                    keys= row.names(ALL_DEG), 
                    keytype= "TAIR" , 
                    column= "SYMBOL", 
                    multiVals= "first")

ALL_DEG$entrez = mapIds(org.At.tair.db, 
                    keys=row.names(ALL_DEG), 
                    keytype="TAIR", 
                    column="ENTREZID", 
                    multiVals="first")

deg_table2 <- ALL_DEG[ ,-c(1:2)]

non_duplicates <- which(!duplicated(deg_table2$symbol))
deg_table2 <- deg_table2[non_duplicates, ]
ALL_DEG_tb <- ALL_DEG[ ,-c(1:2)] %>%
data.frame() %>%
rownames_to_column(var="gene") %>%
as_tibble()

w = which(ALL_DEG_tb$gene%in% deg_table2$symbol)
res_ids = ALL_DEG_tb[w,]
m = match(res_ids$gene, deg_table2$symbol)
res_ids$ensgene = deg_table2$entrez[m]
#oRA

## background set of ensgenes
allOE_genes <- ALL_DEG_tb$entrez
sigOE = dplyr::filter(ALL_DEG_tb, padj < 0.05)
sigOE_genes = sigOE$entrez

#Now we can perform the GO enrichment analysis and save the results:
## Run GO enrichment analysis
ego <- enrichGO(gene = sigOE_genes,
universe = allOE_genes,
keyType = "ENTREZID",
OrgDb = org.At.tair.db,
ont = "BP",
pAdjustMethod = "BH",
qvalueCutoff = 0.05,
readable = TRUE)

## Output results from GO analysis to a table
cluster_summary <- data.frame(ego)
write.csv(cluster_summary, "clusterProfiler_oe.csv")

dotplot(ego, showCategory=50)

pwt <- pairwise_termsim(ego,
                        method = "JC",
                        semData = NULL,
                        showCategory = 50)
emapplot(pwt, showCategory = 50)

## To color genes by log2 fold changes, we need to extract the log2 fold changes from o
OE_foldchanges <- sigOE$log2FoldChange
names(OE_foldchanges) <- sigOE$gene

## Cnetplot details the genes associated with one or more terms - by default gives the
cnetplot(ego,
         categorySize="pvalue",
         showCategory = 5,
         foldChange=OE_foldchanges,
         vertex.label.font=6)
Warning: Use 'color.params = list(foldChange = your_value)' instead of 'foldChange'.
 The foldChange parameter will be removed in the next version.Scale for size is already present.
Adding another scale for size, which will replace the existing scale.

## If some of the high fold changes are getting drowned out due to a large range, you c
OE_foldchanges <- ifelse(OE_foldchanges > 2, 2, OE_foldchanges)
OE_foldchanges <- ifelse(OE_foldchanges < -2, -2, OE_foldchanges)
cnetplot(ego,
         categorySize="pvalue",
         showCategory = 5,
         foldChange=OE_foldchanges,
         vertex.label.font=6)
Warning: Use 'color.params = list(foldChange = your_value)' instead of 'foldChange'.
 The foldChange parameter will be removed in the next version.Scale for size is already present.
Adding another scale for size, which will replace the existing scale.

## Subsetting the ego results without overwriting original `ego` variable
ego2 <- ego
ego2@result <- ego@result[c(1,3,4,8,9),]
## Plotting terms of interest
cnetplot(ego2,
         categorySize="pvalue",
         foldChange=OE_foldchanges,
         showCategory = 5,
         vertex.label.font=6)
Warning: Use 'color.params = list(foldChange = your_value)' instead of 'foldChange'.
 The foldChange parameter will be removed in the next version.Scale for size is already present.
Adding another scale for size, which will replace the existing scale.

# Gene set enrichment analysis (GSEA)
m = match(res_ids$ensgene,deg_table2$gene)
res_ids$entrez = deg_table2$entrez[m]
## remove any NA values
wna = which(is.na(res_ids$entrez))
res_entrez = res_ids[-wna,]
## remove any Entrez duplicates
res_entrez <- res_entrez[which(duplicated(res_entrez$entrez) == F), ]
wdup = which(!duplicated(res_entrez$entrez))
res_entrez=res_entrez[wdup,]
## Extract the foldchanges
foldchanges <- res_entrez$log2FoldChange
## Name each fold change with the corresponding Entrez ID
names(foldchanges) <- res_entrez$entrez
## Sort fold changes in decreasing order
foldchanges <- sort(foldchanges, decreasing = TRUE)
head(foldchanges)
named numeric(0)
# GSEA using gene sets associated with BP Gene Ontology terms
gseaGO <- gseGO(geneList = foldchanges,
OrgDb = org.At.tair.db,
ont = 'BP',
nPerm = 1000,
minGSSize = 20,
pvalueCutoff = 0.05,
verbose = FALSE)
--> Expected input gene ID: 832037,827979,834508,827093,819589,839323
Error in check_gene_id(geneList, geneSets) : 
  --> No gene can be mapped....
library(GSEABase)
Loading required package: annotate
Loading required package: XML

Attaching package: 'annotate'

The following object is masked from 'package:biomartr':

    getGO

Loading required package: graph

Attaching package: 'graph'

The following object is masked from 'package:XML':

    addNode

The following object is masked from 'package:stringr':

    boundary
# Load in GMT file of gene sets (we won't download it from the Broad Institute [website](http://sc2 <- read.gmt("data/c2.all.v2023.1.Hs.entrez.gmt")
msig <- GSEA(foldchanges, TERM2GENE=c2, verbose=FALSE)
Error: object 'c2' not found
---
title: "3_Functional_Analysis_Notebook"
output: html_notebook
---



```{r}
if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install(c("AnnotationHub", "ensembldb", "tidyverse", "readr")) 


```


```{r}
setwd("~/Desktop/Lab/Working_Projects/trapseq_2023")
library(readr)
OG_DEGFILE<- read_delim(file="deg_analysis_DESeq_all.csv", delim = ",")
nrow(OG_DEGFILE)

deg_table <- data.frame(OG_DEGFILE, row.names = OG_DEGFILE$row )#gene ids stored as row names
geneID<- OG_DEGFILE$row
deg_table <- deg_table[ ,-c(1:2)]
```



```{r}
library(biomartr) 
library(tidyverse)

biomartr::organismBM(organism = "Arabidopsis thaliana")

arabido_attributes = 
  biomartr::organismAttributes("Arabidopsis thaliana") %>% 
  filter(dataset == "athaliana_eg_gene")
arabido_attributes

attributes_to_retrieve = c("tair_symbol", "entrezgene_id")

result_BM <- biomartr::biomart( genes      = OG_DEGFILE$row,                  # genes were retrieved using biomartr::getGenome()
                                mart       = "plants_mart",                     # marts were selected with biomartr::getMarts()
                                dataset    = "athaliana_eg_gene",               # datasets were selected with biomartr::getDatasets()
                                attributes = attributes_to_retrieve,            # attributes were selected with biomartr::getAttributes()
                                filters =   "ensembl_gene_id" )# query key
head(result_BM)
```

```{r}
# building the universe!
library(readr)
allgenes <- read_table("featcount.txt", 
                        col_names = TRUE, skip = 1)
read.count <- data.frame(allgenes, row.names = allgenes$Geneid ) #gene ids stored as row names
all_arabidopsis_genes <- (read.count$Geneid) # directly selects the gene column

# we want the correspondence of TAIR/Ensembl symbols with NCBI Entrez gene ids
attributes_to_retrieve = c("tair_symbol", "uniprotswissprot","entrezgene_id")

# Query the Ensembl API
all_arabidopsis_genes_annotated <- biomartr::biomart(genes = all_arabidopsis_genes,
                                                     mart       = "plants_mart",                 
                                                     dataset    = "athaliana_eg_gene",           
                                                     attributes = attributes_to_retrieve,        
                                                     filters =  "ensembl_gene_id" )  

# for compatibility with enrichGO universe
# genes in the universe need to be characters and not integers (Entrez gene id)
all_arabidopsis_genes_annotated$entrezgene_id = as.character(
  all_arabidopsis_genes_annotated$entrezgene_id) 
```
```{r}
library(clusterProfiler)
# retrieving NCBI Entrez gene id for our genes called differential
diff_arabidopsis_genes_annotated <- biomartr::biomart(genes = geneID,
                                                     mart       = "plants_mart",                 
                                                     dataset    = "athaliana_eg_gene",           
                                                     attributes = attributes_to_retrieve,        
                                                     filters =  "ensembl_gene_id" )  

```


```{r}
library("biomartr")
library("clusterProfiler")
library("tidyverse")
library("enrichplot")
suppressPackageStartupMessages(library("org.At.tair.db"))
library("biomaRt")  # only use to remove cache bug

# performing the ORA for Gene Ontology Biological Process class
ora_analysis_bp <- enrichGO(gene = diff_arabidopsis_genes_annotated$entrezgene_id, 
                            universe = all_arabidopsis_genes_annotated$entrezgene_id, 
                            OrgDb = org.At.tair.db,  # contains the TAIR/Ensembl id to GO correspondence for A. thaliana
                            keyType = "ENTREZID",
                            ont = "BP",              # either "BP", "CC" or "MF",
                            pAdjustMethod = "BH",
                            pvalueCutoff  = 0.05,
                            qvalueCutoff  = 0.01,
                            readable = TRUE, 
                            pool = FALSE)

# clusterProfiler::simplify to disambiguate which simplify() function you want to use
ora_analysis_bp_simplified <- clusterProfiler::simplify(ora_analysis_bp) 
```


```{r}
write_delim(x = as.data.frame(ora_analysis_bp_simplified@result), 
            path = "go_results.tsv", 
            delim = "\t")

# have a look at a few columns and rows
ora_analysis_bp_simplified@result[1:5,1:8]
```


```{r}
dotplot(ora_analysis_bp_simplified)
```


```{r}
library("enrichplot")
ora_analysis_bp <- pairwise_termsim(ora_analysis_bp, method = "JC")
emapplot(ora_analysis_bp, color = "qvalue")
```


```{r}
library("biomartr")
library("clusterProfiler")
library("tidyverse")
library("enrichplot")
suppressPackageStartupMessages(library("org.At.tair.db"))
library("biomaRt")  # only use to remove cache bug


search_kegg_organism('ath', by='kegg_code')
search_kegg_organism('Arabidopsis thaliana', by='scientific_name')


ora_analysis_kegg <- enrichKEGG(gene = diff_arabidopsis_genes_annotated$entrezgene_id,
                                universe = all_arabidopsis_genes_annotated$entrezgene_id,
                                organism = "ath",
                                keyType = "ncbi-geneid",
                                minGSSize = 10,
                                maxGSSize = 500,
                                pAdjustMethod = "BH",
                                qvalueCutoff = 0.01,
                                use_internal_data = FALSE) # force to query latest KEGG db
head(ora_analysis_kegg)
ora_analysis_kegg_modules <- enrichMKEGG(gene = diff_arabidopsis_genes_annotated$entrezgene_id,
                                         universe = all_arabidopsis_genes_annotated$entrezgene_id,
                                         organism = "ath",
                                         keyType = "ncbi-geneid",
                                         minGSSize = 10,           # minimal size of genes annotated by Ontology term for testing.
                                         maxGSSize = 500,          # maximal size of genes annotated for testing
                                         pAdjustMethod = "BH",
                                         qvalueCutoff = 0.01)

browseKEGG(ora_analysis_kegg, 'ath04145')
library("pathview")
ath04145 <- pathview(gene.data  = all_arabidopsis_genes_annotated$entrezgene_id,
                     pathway.id = "ath04145",
                     species    = "ath")
#ggplot(ora_analysis_kegg)
```


```{r}
# create a simple dotplot graph
dotplot(ora_analysis_kegg, 
    color = "qvalue", 
    showCategory = 10, 
    size = "Count")

```

```{r}
suppressPackageStartupMessages(library("org.At.tair.db"))
library(enrichplot)
library(DOSE)
library(pathview)
library(clusterProfiler)
library(AnnotationHub)
library(ensembldb)
library(tidyverse)


#convert gene symbers to ID

#if (!require("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")

#BiocManager::install(c("AnnotationDbi", "org.At.tair.db", "pathview", "gageData"))

#library(AnnotationDbi)
#library(org.At.tair.db)
#class(org.At.tair.db)
#columns(org.At.tair.db)

#print(res05)
library(readr)
#OG_DEGFILE<- read_delim(file="deg_analysis_DESeq_all.csv", delim = ",")
#nrow(OG_DEGFILE)


ALL_DEG<- read.csv(file = "deg_analysis_DESeq_all.csv")
ALL_DEG <- data.frame(ALL_DEG, row.names = ALL_DEG$row )#gene ids stored as row names
geneID<- ALL_DEG$row
#deg_table <- deg_table[ ,-c(1:2)]

ALL_DEG$symbol = mapIds(org.At.tair.db, 
                    keys= row.names(ALL_DEG), 
                    keytype= "TAIR" , 
                    column= "SYMBOL", 
                    multiVals= "first")

ALL_DEG$entrez = mapIds(org.At.tair.db, 
                    keys=row.names(ALL_DEG), 
                    keytype="TAIR", 
                    column="ENTREZID", 
                    multiVals="first")

deg_table2 <- ALL_DEG[ ,-c(1:2)]

non_duplicates <- which(!duplicated(deg_table2$symbol))
deg_table2 <- deg_table2[non_duplicates, ]
```

```{r}
ALL_DEG_tb <- ALL_DEG[ ,-c(1:2)] %>%
data.frame() %>%
rownames_to_column(var="gene") %>%
as_tibble()

w = which(ALL_DEG_tb$gene%in% deg_table2$symbol)
res_ids = ALL_DEG_tb[w,]
m = match(res_ids$gene, deg_table2$symbol)
res_ids$ensgene = deg_table2$entrez[m]
```

```{r}
#oRA

## background set of ensgenes
allOE_genes <- ALL_DEG_tb$entrez
sigOE = dplyr::filter(ALL_DEG_tb, padj < 0.05)
sigOE_genes = sigOE$entrez

#Now we can perform the GO enrichment analysis and save the results:
## Run GO enrichment analysis
ego <- enrichGO(gene = sigOE_genes,
universe = allOE_genes,
keyType = "ENTREZID",
OrgDb = org.At.tair.db,
ont = "BP",
pAdjustMethod = "BH",
qvalueCutoff = 0.05,
readable = TRUE)

## Output results from GO analysis to a table
cluster_summary <- data.frame(ego)
write.csv(cluster_summary, "clusterProfiler_oe.csv")

dotplot(ego, showCategory=50)
```

```{r}
pwt <- pairwise_termsim(ego,
                        method = "JC",
                        semData = NULL,
                        showCategory = 50)
emapplot(pwt, showCategory = 50)
```
```{r}
## To color genes by log2 fold changes, we need to extract the log2 fold changes from o
OE_foldchanges <- sigOE$log2FoldChange
names(OE_foldchanges) <- sigOE$gene

## Cnetplot details the genes associated with one or more terms - by default gives the
cnetplot(ego,
         categorySize="pvalue",
         showCategory = 5,
         foldChange=OE_foldchanges,
         vertex.label.font=6)

## If some of the high fold changes are getting drowned out due to a large range, you c
OE_foldchanges <- ifelse(OE_foldchanges > 2, 2, OE_foldchanges)
OE_foldchanges <- ifelse(OE_foldchanges < -2, -2, OE_foldchanges)
cnetplot(ego,
         categorySize="pvalue",
         showCategory = 5,
         foldChange=OE_foldchanges,
         vertex.label.font=6)
```

```{r}
## Subsetting the ego results without overwriting original `ego` variable
ego2 <- ego
ego2@result <- ego@result[c(1,3,4,8,9),]
## Plotting terms of interest
cnetplot(ego2,
         categorySize="pvalue",
         foldChange=OE_foldchanges,
         showCategory = 5,
         vertex.label.font=6)
```
```{r}
# Gene set enrichment analysis (GSEA)
m = match(res_ids$ensgene,deg_table2$gene)
res_ids$entrez = deg_table2$entrez[m]
## remove any NA values
wna = which(is.na(res_ids$entrez))
res_entrez = res_ids[-wna,]
## remove any Entrez duplicates
res_entrez <- res_entrez[which(duplicated(res_entrez$entrez) == F), ]
wdup = which(!duplicated(res_entrez$entrez))
res_entrez=res_entrez[wdup,]
```

```{r}
## Extract the foldchanges
foldchanges <- res_entrez$log2FoldChange
## Name each fold change with the corresponding Entrez ID
names(foldchanges) <- res_entrez$entrez
```

```{r}
## Sort fold changes in decreasing order
foldchanges <- sort(foldchanges, decreasing = TRUE)
head(foldchanges)
```

```{r}
# GSEA using gene sets associated with BP Gene Ontology terms
gseaGO <- gseGO(geneList = foldchanges,
OrgDb = org.At.tair.db,
ont = 'BP',
nPerm = 1000,
minGSSize = 20,
pvalueCutoff = 0.05,
verbose = FALSE)
gseaGO_results <- gseaGO@result
gseaplot(gseaGO, geneSetID = 'GO:0007423')
```

```{r}
library(GSEABase)
# Load in GMT file of gene sets (we won't download it from the Broad Institute [website](http://sc2 <- read.gmt("data/c2.all.v2023.1.Hs.entrez.gmt")
msig <- GSEA(foldchanges, TERM2GENE=c2, verbose=FALSE)
msig_df <- data.frame(msig)
```

